Load Packages & Data

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'ggplot2' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.0     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.1     ✔ tibble    3.1.8
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(dplyr)
library(stringr)
library(knitr)
books <- read_csv("data/books.csv")
## Rows: 11123 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): title, authors, isbn, isbn13, language_code, publication_date, publ...
## dbl (6): rowid, bookID, average_rating, num_pages, ratings_count, text_revie...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Exploring the Data

glimpse(books)
## Rows: 11,123
## Columns: 13
## $ rowid              <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, …
## $ bookID             <dbl> 1, 2, 4, 5, 8, 9, 10, 12, 13, 14, 16, 18, 21, 22, 2…
## $ title              <chr> "Harry Potter and the Half-Blood Prince (Harry Pott…
## $ authors            <chr> "J.K. Rowling/Mary GrandPré", "J.K. Rowling/Mary Gr…
## $ average_rating     <dbl> 4.57, 4.49, 4.42, 4.56, 4.78, 3.74, 4.73, 4.38, 4.3…
## $ isbn               <chr> "0439785960", "0439358078", "0439554896", "04396554…
## $ isbn13             <chr> "9780439785969", "9780439358071", "9780439554893", …
## $ language_code      <chr> "eng", "eng", "eng", "eng", "eng", "en-US", "eng", …
## $ num_pages          <dbl> 652, 870, 352, 435, 2690, 152, 3342, 815, 815, 215,…
## $ ratings_count      <dbl> 2095690, 2153167, 6333, 2339585, 41428, 19, 28242, …
## $ text_reviews_count <dbl> 27591, 29221, 244, 36325, 164, 1, 808, 254, 4080, 4…
## $ publication_date   <chr> "9/16/2006", "9/1/2004", "11/1/2003", "5/1/2004", "…
## $ publisher          <chr> "Scholastic Inc.", "Scholastic Inc.", "Scholastic",…

The variable names are:

names(books)
##  [1] "rowid"              "bookID"             "title"             
##  [4] "authors"            "average_rating"     "isbn"              
##  [7] "isbn13"             "language_code"      "num_pages"         
## [10] "ratings_count"      "text_reviews_count" "publication_date"  
## [13] "publisher"

Cleaning the Data

books_cleaned <- janitor::clean_names(books)

#glimpse(books)
books_cleaned %>% 
  summarise(across(rowid:publisher, ~sum(is.na(.x))))

There are no NA values!


Analysing the Data

Publisher:

What publisher has the most entries in the table?

most_titled_pub <- books_cleaned %>% 
  group_by(publisher) %>% 
  summarise(title_count = (n_distinct(title))) %>% 
  arrange(desc(title_count)) %>% 
  head(1) 

most_titled_pub
  • The publisher with the most titles in this data set is called “Vintage”.

What publisher has the best overall rating?

avg_overall_rating <- books_cleaned %>% 
  group_by(publisher) %>% 
  mutate(avg_overall_rating = (mean(average_rating))) %>% 
  arrange(desc(avg_overall_rating)) %>% 
  ungroup() %>% 
  select(publisher, avg_overall_rating)


avg_overall_rating
  • There are 12 publishers all with an average overall rating of 5. However, all of these publishers all only have 1 book that is rated so therefore the results could be scewed.

Are there any publishers that have published in >1 language?

books_cleaned %>% 
  group_by(publisher) %>% 
  summarise(language_count = n_distinct(language_code)) %>% 
  arrange(desc(language_count)) %>% 
  filter(language_count >1)
  • There are 507 publishers that have published in >1 language code, “Oxford University Press USA” have published in the most languages, which I would assume to be English, US English, British English, Spanish, Italian and greek:
books_cleaned %>% 
  select(publisher, language_code) %>% 
  filter(publisher == "Oxford University Press  USA") %>% 
  distinct(language_code, publisher) #to remove duplicate entries

Authors:

  • There are a lot of authors that are entered with their illustrator/ co-author, to look at lead authors only:
books_cleaned <- books_cleaned %>% 
  mutate(lead_author = gsub("/..*", "", authors), 
         .after = authors) %>% 
  arrange(desc(authors))

books_cleaned

What author has the highest overall rating?

max_rated_author <- books_cleaned %>% 
  group_by(lead_author) %>% 
  mutate(avg_rating = (mean(average_rating))) %>% 
  arrange(desc(avg_rating)) %>% 
  select(lead_author, average_rating)

max_rated_author
  • There are 17 authors with an average rating of 5.00.

What author has the most titles listed?

most_titled_author <- books_cleaned %>% 
  group_by(lead_author) %>% 
  summarise(title_count_author = (n_distinct(title))) %>% 
  arrange(desc(title_count_author)) %>% 
 head(1) 

most_titled_author
  • The author with the most titles in this data is Stephen King, who has 63 books listed in the table:
books_cleaned %>% 
  select(title, lead_author) %>% 
  filter(lead_author == "Stephen King") %>% 
  distinct(title, lead_author) #to remove duplicate entries

Are there any authors that have published in >1 language?

books_cleaned %>% 
  group_by(lead_author) %>% 
  summarise(language_count = n_distinct(language_code)) %>% 
  arrange(desc(language_count)) %>% 
  filter(language_count >1)
  • There are 721 authors that have published in >1 language, with J.K Rowling being the most at 7 languages. I assume these languages are Turkish, German, Latvian, English, Chinese, Spanish and Gaelic(?)
books_cleaned %>% 
  select(lead_author, language_code) %>% 
  filter(lead_author == "J.K. Rowling") %>% 
  distinct(language_code, lead_author) #to remove duplicate entries

Are there any authors that have >1 publisher?

author_max_publishers <- books_cleaned %>% 
  group_by(lead_author) %>% 
  summarise(publisher_count = n_distinct(publisher)) %>% 
  arrange(desc(publisher_count)) %>% 
  head(1) %>%  
  pull(lead_author)

author_max_publishers
## [1] "Stephen King"
  • Stephen King has the most publishers (46).

Annual Data:

books_years <- books_cleaned %>% 
  mutate(publication_year = str_sub(publication_date, -4),
         .after = publication_date) %>% 
  arrange(publication_year) 

books_years

What was the best book per year?

books_years %>% 
  mutate(publication_year = as.integer(publication_year)) %>% 
  select("publication_year", "title", "average_rating") %>% 
  arrange(publication_year, desc(average_rating)) %>% 
  group_by(publication_year) %>% 
  slice_max(average_rating, n=1)  
  • The table above shows the best book per year based on average review. For any years where there are multiple books of the same highest rating, they are all included.

How many books were rated in each year?

books_years %>% 
  group_by(publication_year) %>% 
  summarise(books_per_year = n_distinct(title)) %>% 
  arrange(publication_year) #%>% 
  #arrange(desc(books_per_year)) #to check year with most ratings
  • The table above shows the number of books per year.
  • The year with the highest number of books was 2006 with 1674 books reviewed.

An attempt to solve the duplicate publisher issue!

url_ISBN13 <- "https://blog-cdn.reedsy.com/directories/admin/attachments/large_How-to-read-an-ISBN-2d4aca.jpg"
include_graphics(url_ISBN13)

books_cleaned %>% 
  
  mutate(isbn13_count = nchar(isbn13)) %>% 
  filter(isbn13_count != 13)
books_isbn_pub <- books_cleaned %>% 
  mutate(isbn13_publisher = substr(isbn13, 5, 7),
         .after = publisher) %>% 
  arrange(publisher)
books_isbn_pub
books_isbn13_pub_grouped <- books_isbn_pub %>% 
  group_by(publisher) %>% 
  summarise(count_isbn_pub = n_distinct(isbn13_publisher)) %>% 
  arrange(desc(count_isbn_pub)) %>% 
  filter(count_isbn_pub  >1)

books_isbn13_pub_grouped
books_isbn_pub %>% 
  filter(publisher == "HarperCollins") %>% 
  arrange(isbn13_publisher)